Contribution

All group members participated in both assignments and, after discussion, wrote this report together.

Assignment 1

Please see the resulting picture below:

Figure
Figure

Assignment 2

Question 1

# Load the SENIC data set. The text file is read without a header row, so the
# twelve column names are attached immediately after reading.
senic <- setNames(
  read.table("SENIC.txt", header = FALSE),
  c(
    "ID",
    "Length of Stay",
    "Age",
    "Infection Risk",
    "Routine Culturing Ratio",
    "Routine Chest X-ray Ratio",
    "Number of Beds",
    "Medical School Affiliation",
    "Region",
    "Average Daily Census",
    "Number of Nurses",
    "Available Facilities & Services"
  )
)

Question 2

# Find the outliers of a numeric vector with the 1.5 * IQR rule.
#
# A value is flagged when it lies more than 1.5 interquartile ranges above the
# third quartile or below the first quartile.
#
# Args:
#   input_vector: numeric vector. Missing values are ignored when the
#     quartiles are computed and are never reported as outliers.
#
# Returns:
#   Integer vector with the positions of the outliers in `input_vector`
#   (integer(0) when there are none).
Q22_function <- function(input_vector) {
  # na.rm = TRUE: quantile() stops with an error on NA input otherwise.
  # names = FALSE: drop the "25%"/"75%" labels so they cannot leak into the
  # names of the comparison result.
  quartiles <- quantile(input_vector, probs = c(0.25, 0.75),
                        na.rm = TRUE, names = FALSE)
  threshold <- 1.5 * (quartiles[2] - quartiles[1])
  lower_fence <- quartiles[1] - threshold
  upper_fence <- quartiles[2] + threshold

  # which() skips NA comparisons, so missing values are not flagged.
  which(input_vector > upper_fence | input_vector < lower_fence)
}

Question 3

# Rows of senic whose Infection Risk is flagged by the 1.5 * IQR rule.
Infection_risk_outliers_indecies <- Q22_function(senic[["Infection Risk"]])
outlier_data <- senic[Infection_risk_outliers_indecies, ]

# Density curve of Infection Risk; the outliers are overlaid as blue diamonds
# (shape 5) on the x axis (y = 0).
plot23 <- ggplot(senic, aes(x = `Infection Risk`)) +
  geom_density() +
  geom_point(data = outlier_data, y = 0, shape = 5, colour = "blue")
plot23

By observing the graph, it can be noticed that the slope is steeper on the right-hand side compared to the left-hand side. In terms of the outliers, three of them are located at higher values, and the other two are at lower values.

Question 4

# Columns treated as quantitative: every column of senic except "ID",
# "Medical School Affiliation" and "Region".
quantitative_variables <- c(
  "Length of Stay", "Age", "Infection Risk",
  "Routine Culturing Ratio", "Routine Chest X-ray Ratio",
  "Number of Beds", "Average Daily Census", "Number of Nurses",
  "Available Facilities & Services"
)

# Density plot of one column of senic with its outliers marked.
#
# Note: this function is defined at top level under the name `plot`, so in
# the global environment it takes precedence over the base graphics plot().
#
# Args:
#   name: column name of senic (a string); also used as the x-axis label.
#
# Returns:
#   A ggplot object: the density curve, plus blue diamonds at y = 0 for the
#   values flagged by Q22_function() when there are any.
plot <- function(name) {
  column_values <- senic[[name]]
  outlier_values <- column_values[Q22_function(column_values)]

  panel <- ggplot() +
    geom_density(data = senic, aes(x = column_values)) +
    xlab(name)

  # No outliers: return the bare density curve so the plot still works.
  if (length(outlier_values) == 0) {
    return(panel)
  }

  panel + geom_point(aes(x = outlier_values), y = 0, shape = 5, col = "blue")
}
# One density panel per quantitative variable, arranged on a single page.
plot_list <- lapply(
  quantitative_variables,
  function(variable_name) plot(variable_name)
)
grid_plot <- grid.arrange(
  grobs = plot_list,
  top = "Density Plot of All Quantitative Variables"
)

By observing the above graph, it can be noticed that “Length of Stay”, “Routine Culturing Ratio”, “Number of Beds”, “Average Daily Census” and “Number of Nurses” are right-skewed, and their outliers appear at higher values.
On the other hand, “Age”, “Infection Risk”, “Routine Chest X-ray Ratio” and “Available Facilities & Services” are close to symmetric, and “Available Facilities & Services” does not contain any outliers.

Question 5

# Scatter plot of Number of Nurses against Infection Risk, with the points
# coloured by Number of Beds.
plotQ2_5 <- ggplot(
  data = senic,
  mapping = aes(
    x = `Infection Risk`,
    y = `Number of Nurses`,
    colour = `Number of Beds`
  )
) +
  geom_point()
print(plotQ2_5)

Compared to the graph in step 4, the above graph can let one investigate the correlation between two variables (In this case, Infection Risk and Number of Nurses).
In terms of the color scale, there are a few potential problems with it. First, the shade of the color blue may not be easily distinguishable by the human eye. Second, while adding a color scale is a way to expand the dimensions of the graph, the correlation it represents might not be intuitive for the reader.

Question 6

# Convert the ggplot2 density plot from Question 3 into a plotly graph.
ggplotly(p = plot23)

As the graph above shows, a graph made with ggplot2 can be converted directly into a plotly graph. Compared to the previous graph, plotly creates an interactive graph: we can use the different feature buttons to control it, and we can hover the cursor over the graph to display the current value.

Question 7

# Infection Risk values flagged as outliers by the 1.5 * IQR rule.
outliers <- senic[["Infection Risk"]][Q22_function(senic[["Infection Risk"]])]

# Histogram of Infection Risk with the outlying values overlaid as diamond
# markers at y = 0.
figure <- senic %>%
  mutate(ifoutliers = `Infection Risk` %in% outliers) %>%
  plot_ly(x = ~`Infection Risk`, type = "histogram") %>%
  add_trace(
    x = ~`Infection Risk`[ifoutliers],
    y = 0,
    type = "scatter",
    mode = "markers",
    marker = list(symbol = "diamond")
  ) %>%
  layout(title = "The histogram of Infection Risk and its outliers")

figure

Question 8

# Shiny UI: check boxes to pick the variables, a slider for the density
# bandwidth, and the area where the density plots are drawn.
Q28ui <- fluidPage(
  checkboxGroupInput(
    inputId = "variables",
    label = "Choose Variables",
    choices = quantitative_variables
  ),
  sliderInput(
    inputId = "bw",
    label = "Choose bandwidth size",
    min = 0.02,
    max = 10.0,
    value = 0.2
  ),
  plotOutput(outputId = "densPlot")
)

# Shiny server: draws one density plot per ticked variable, using the
# bandwidth chosen on the slider, and arranges the plots in a single grid.
Q28server <- function(input, output) {

  # Density curve for the column `name` of senic, with blue diamonds at y = 0
  # for every value flagged by Q22_function().
  build_density <- function(name, bandwidth) {
    column_values <- senic[[name]]
    outlier_values <- column_values[Q22_function(column_values)]

    panel <- ggplot() +
      geom_density(data = senic, aes(x = column_values), bw = bandwidth) +
      xlab(name)

    # Without outliers there is nothing to overlay.
    if (length(outlier_values) == 0) {
      return(panel)
    }
    panel + geom_point(aes(x = outlier_values), y = 0, shape = 5, col = "blue")
  }

  output$densPlot <- renderPlot({
    # Stop with a message until at least one variable has been ticked.
    validate(need(input$variables, "Please select variables."))

    panels <- lapply(input$variables, build_density, bandwidth = input$bw)
    grid.arrange(grobs = panels, top = "density plot of Variables")
  })
}

# Combine the UI and the server function into a Shiny app object.
shinyApp(server = Q28server, ui = Q28ui)
Shiny applications not supported in static R Markdown documents

The curve becomes smoother as the bandwidth value increases, and rougher as it decreases.
However, there is no single bandwidth value that is optimal for all of the variables, since every variable spans a different range. If the bandwidth is increased, a variable that spans a larger range might get a better and smoother plot, but a variable that spans a smaller range will become too smooth and lose important information. For example, setting the bandwidth to 10 for ‘Infection Risk’ demonstrates this issue.

Appendix

# Show the source code of every chunk in the knitted report.
knitr::opts_chunk$set(echo = TRUE)
# The workspace is deliberately not cleared here: rm(list = ls()) would wipe
# the caller's objects when the document is run interactively.
library(ggplot2)
library(gridExtra)
library(plotly)
library(shiny)
# Load the SENIC data set. The text file is read without a header row, so the
# twelve column names are attached immediately after reading.
senic <- setNames(
  read.table("SENIC.txt", header = FALSE),
  c(
    "ID",
    "Length of Stay",
    "Age",
    "Infection Risk",
    "Routine Culturing Ratio",
    "Routine Chest X-ray Ratio",
    "Number of Beds",
    "Medical School Affiliation",
    "Region",
    "Average Daily Census",
    "Number of Nurses",
    "Available Facilities & Services"
  )
)

# Find the outliers of a numeric vector with the 1.5 * IQR rule.
#
# A value is flagged when it lies more than 1.5 interquartile ranges above the
# third quartile or below the first quartile.
#
# Args:
#   input_vector: numeric vector. Missing values are ignored when the
#     quartiles are computed and are never reported as outliers.
#
# Returns:
#   Integer vector with the positions of the outliers in `input_vector`
#   (integer(0) when there are none).
Q22_function <- function(input_vector) {
  # na.rm = TRUE: quantile() stops with an error on NA input otherwise.
  # names = FALSE: drop the "25%"/"75%" labels so they cannot leak into the
  # names of the comparison result.
  quartiles <- quantile(input_vector, probs = c(0.25, 0.75),
                        na.rm = TRUE, names = FALSE)
  threshold <- 1.5 * (quartiles[2] - quartiles[1])
  lower_fence <- quartiles[1] - threshold
  upper_fence <- quartiles[2] + threshold

  # which() skips NA comparisons, so missing values are not flagged.
  which(input_vector > upper_fence | input_vector < lower_fence)
}
# Rows of senic whose Infection Risk is flagged by the 1.5 * IQR rule.
Infection_risk_outliers_indecies <- Q22_function(senic[["Infection Risk"]])
outlier_data <- senic[Infection_risk_outliers_indecies, ]

# Density curve of Infection Risk; the outliers are overlaid as blue diamonds
# (shape 5) on the x axis (y = 0).
plot23 <- ggplot(senic, aes(x = `Infection Risk`)) +
  geom_density() +
  geom_point(data = outlier_data, y = 0, shape = 5, colour = "blue")
plot23
# Columns treated as quantitative: every column of senic except "ID",
# "Medical School Affiliation" and "Region".
quantitative_variables <- c(
  "Length of Stay", "Age", "Infection Risk",
  "Routine Culturing Ratio", "Routine Chest X-ray Ratio",
  "Number of Beds", "Average Daily Census", "Number of Nurses",
  "Available Facilities & Services"
)

# Density plot of one column of senic with its outliers marked.
#
# Note: this function is defined at top level under the name `plot`, so in
# the global environment it takes precedence over the base graphics plot().
#
# Args:
#   name: column name of senic (a string); also used as the x-axis label.
#
# Returns:
#   A ggplot object: the density curve, plus blue diamonds at y = 0 for the
#   values flagged by Q22_function() when there are any.
plot <- function(name) {
  column_values <- senic[[name]]
  outlier_values <- column_values[Q22_function(column_values)]

  panel <- ggplot() +
    geom_density(data = senic, aes(x = column_values)) +
    xlab(name)

  # No outliers: return the bare density curve so the plot still works.
  if (length(outlier_values) == 0) {
    return(panel)
  }

  panel + geom_point(aes(x = outlier_values), y = 0, shape = 5, col = "blue")
}
# One density panel per quantitative variable, arranged on a single page.
plot_list <- lapply(
  quantitative_variables,
  function(variable_name) plot(variable_name)
)
grid_plot <- grid.arrange(
  grobs = plot_list,
  top = "Density Plot of All Quantitative Variables"
)


# Scatter plot of Number of Nurses against Infection Risk, with the points
# coloured by Number of Beds.
plotQ2_5 <- ggplot(
  data = senic,
  mapping = aes(
    x = `Infection Risk`,
    y = `Number of Nurses`,
    colour = `Number of Beds`
  )
) +
  geom_point()
print(plotQ2_5)
 

# Convert the ggplot2 density plot of Infection Risk into a plotly graph.
ggplotly(p = plot23)

# Infection Risk values flagged as outliers by the 1.5 * IQR rule.
outliers <- senic[["Infection Risk"]][Q22_function(senic[["Infection Risk"]])]

# Histogram of Infection Risk with the outlying values overlaid as diamond
# markers at y = 0.
figure <- senic %>%
  mutate(ifoutliers = `Infection Risk` %in% outliers) %>%
  plot_ly(x = ~`Infection Risk`, type = "histogram") %>%
  add_trace(
    x = ~`Infection Risk`[ifoutliers],
    y = 0,
    type = "scatter",
    mode = "markers",
    marker = list(symbol = "diamond")
  ) %>%
  layout(title = "The histogram of Infection Risk and its outliers")

figure



# Shiny UI: check boxes to pick the variables, a slider for the density
# bandwidth, and the area where the density plots are drawn.
Q28ui <- fluidPage(
  checkboxGroupInput(
    inputId = "variables",
    label = "Choose Variables",
    choices = quantitative_variables
  ),
  sliderInput(
    inputId = "bw",
    label = "Choose bandwidth size",
    min = 0.02,
    max = 10.0,
    value = 0.2
  ),
  plotOutput(outputId = "densPlot")
)

# Shiny server: draws one density plot per ticked variable, using the
# bandwidth chosen on the slider, and arranges the plots in a single grid.
Q28server <- function(input, output) {

  # Density curve for the column `name` of senic, with blue diamonds at y = 0
  # for every value flagged by Q22_function().
  build_density <- function(name, bandwidth) {
    column_values <- senic[[name]]
    outlier_values <- column_values[Q22_function(column_values)]

    panel <- ggplot() +
      geom_density(data = senic, aes(x = column_values), bw = bandwidth) +
      xlab(name)

    # Without outliers there is nothing to overlay.
    if (length(outlier_values) == 0) {
      return(panel)
    }
    panel + geom_point(aes(x = outlier_values), y = 0, shape = 5, col = "blue")
  }

  output$densPlot <- renderPlot({
    # Stop with a message until at least one variable has been ticked.
    validate(need(input$variables, "Please select variables."))

    panels <- lapply(input$variables, build_density, bandwidth = input$bw)
    grid.arrange(grobs = panels, top = "density plot of Variables")
  })
}

# Combine the UI and the server function into a Shiny app object.
shinyApp(server = Q28server, ui = Q28ui)